Let’s first take a look at the variables

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the exam-score data from the CSV file ----
dataset <- read_csv(file = "Exam_Score_Prediction.csv")
## Rows: 20000 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): gender, course, internet_access, sleep_quality, study_method, facil...
## dbl (6): student_id, age, study_hours, class_attendance, sleep_hours, exam_s...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dataset |> glimpse() # one line per column: name, type and first values
## Rows: 20,000
## Columns: 13
## $ student_id       <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ age              <dbl> 17, 23, 22, 20, 20, 23, 17, 22, 18, 17, 21, 24, 22, 2…
## $ gender           <chr> "male", "other", "male", "other", "female", "male", "…
## $ course           <chr> "diploma", "bca", "b.sc", "diploma", "diploma", "b.te…
## $ study_hours      <dbl> 2.78, 3.37, 7.88, 0.67, 0.89, 3.48, 1.35, 5.48, 2.89,…
## $ class_attendance <dbl> 92.9, 64.8, 76.8, 48.4, 71.6, 65.4, 69.0, 51.1, 92.0,…
## $ internet_access  <chr> "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes…
## $ sleep_hours      <dbl> 7.4, 4.6, 8.5, 5.8, 9.8, 4.2, 7.4, 8.2, 6.6, 9.8, 5.8…
## $ sleep_quality    <chr> "poor", "average", "poor", "average", "poor", "good",…
## $ study_method     <chr> "coaching", "online videos", "coaching", "online vide…
## $ facility_rating  <chr> "low", "medium", "high", "low", "low", "low", "high",…
## $ exam_difficulty  <chr> "hard", "moderate", "moderate", "moderate", "moderate…
## $ exam_score       <dbl> 58.9, 54.8, 90.3, 29.7, 43.7, 58.2, 53.7, 47.3, 44.9,…
dataset |> dim() # number of rows, then number of columns
## [1] 20000    13
dataset |> str() # class of the object and storage type of every column
## spc_tbl_ [20,000 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ student_id      : num [1:20000] 1 2 3 4 5 6 7 8 9 10 ...
##  $ age             : num [1:20000] 17 23 22 20 20 23 17 22 18 17 ...
##  $ gender          : chr [1:20000] "male" "other" "male" "other" ...
##  $ course          : chr [1:20000] "diploma" "bca" "b.sc" "diploma" ...
##  $ study_hours     : num [1:20000] 2.78 3.37 7.88 0.67 0.89 3.48 1.35 5.48 2.89 6.77 ...
##  $ class_attendance: num [1:20000] 92.9 64.8 76.8 48.4 71.6 65.4 69 51.1 92 44.8 ...
##  $ internet_access : chr [1:20000] "yes" "yes" "yes" "yes" ...
##  $ sleep_hours     : num [1:20000] 7.4 4.6 8.5 5.8 9.8 4.2 7.4 8.2 6.6 9.8 ...
##  $ sleep_quality   : chr [1:20000] "poor" "average" "poor" "average" ...
##  $ study_method    : chr [1:20000] "coaching" "online videos" "coaching" "online videos" ...
##  $ facility_rating : chr [1:20000] "low" "medium" "high" "low" ...
##  $ exam_difficulty : chr [1:20000] "hard" "moderate" "moderate" "moderate" ...
##  $ exam_score      : num [1:20000] 58.9 54.8 90.3 29.7 43.7 58.2 53.7 47.3 44.9 77.7 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   student_id = col_double(),
##   ..   age = col_double(),
##   ..   gender = col_character(),
##   ..   course = col_character(),
##   ..   study_hours = col_double(),
##   ..   class_attendance = col_double(),
##   ..   internet_access = col_character(),
##   ..   sleep_hours = col_double(),
##   ..   sleep_quality = col_character(),
##   ..   study_method = col_character(),
##   ..   facility_rating = col_character(),
##   ..   exam_difficulty = col_character(),
##   ..   exam_score = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Per-column summary: min, quartiles, mean and max for the numeric columns;
# length, class and mode for the character columns.
dataset |> summary()
##    student_id         age           gender             course         
##  Min.   :    1   Min.   :17.00   Length:20000       Length:20000      
##  1st Qu.: 5001   1st Qu.:18.00   Class :character   Class :character  
##  Median :10000   Median :20.00   Mode  :character   Mode  :character  
##  Mean   :10001   Mean   :20.47                                        
##  3rd Qu.:15000   3rd Qu.:22.00                                        
##  Max.   :20001   Max.   :24.00                                        
##   study_hours    class_attendance internet_access     sleep_hours   
##  Min.   :0.080   Min.   :40.60    Length:20000       Min.   :4.100  
##  1st Qu.:2.000   1st Qu.:55.10    Class :character   1st Qu.:5.500  
##  Median :4.040   Median :69.90    Mode  :character   Median :7.000  
##  Mean   :4.008   Mean   :70.02                       Mean   :7.009  
##  3rd Qu.:6.000   3rd Qu.:85.00                       3rd Qu.:8.500  
##  Max.   :7.910   Max.   :99.40                       Max.   :9.900  
##  sleep_quality      study_method       facility_rating    exam_difficulty   
##  Length:20000       Length:20000       Length:20000       Length:20000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    exam_score    
##  Min.   : 19.60  
##  1st Qu.: 48.80  
##  Median : 62.60  
##  Mean   : 62.51  
##  3rd Qu.: 76.30  
##  Max.   :100.00
dataset |> head() # column names and the first six rows
## # A tibble: 6 × 13
##   student_id   age gender course  study_hours class_attendance internet_access
##        <dbl> <dbl> <chr>  <chr>         <dbl>            <dbl> <chr>          
## 1          1    17 male   diploma        2.78             92.9 yes            
## 2          2    23 other  bca            3.37             64.8 yes            
## 3          3    22 male   b.sc           7.88             76.8 yes            
## 4          4    20 other  diploma        0.67             48.4 yes            
## 5          5    20 female diploma        0.89             71.6 yes            
## 6          6    23 male   b.tech         3.48             65.4 yes            
## # ℹ 6 more variables: sleep_hours <dbl>, sleep_quality <chr>,
## #   study_method <chr>, facility_rating <chr>, exam_difficulty <chr>,
## #   exam_score <dbl>
# Which columns are numeric? Keep only those and list their names.
names(select(dataset, where(is.numeric)))
## [1] "student_id"       "age"              "study_hours"      "class_attendance"
## [5] "sleep_hours"      "exam_score"

Now, let’s plot

library(ggplot2)

# Scatter plot of exam score against study hours, with a straight-line fit
# (method = "lm") drawn on top and no confidence band (se = FALSE).
dataset |>
  ggplot(mapping = aes(x = study_hours, y = exam_score)) +
  geom_point(size = 1, colour = "gray30", alpha = 0.3) +
  geom_smooth(se = FALSE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs study hours",
    x = "Study hours",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Same scatter plot, with points and fitted lines coloured by gender
# (one straight-line fit per gender, no confidence band).
dataset |>
  ggplot(mapping = aes(x = study_hours, y = exam_score, colour = gender)) +
  geom_point(size = 1, alpha = 0.6) +
  geom_smooth(se = FALSE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam scores vs Study hours",
    x = "Study hours",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot of exam score against sleep hours, with a straight-line fit
# and no confidence band.
dataset |>
  ggplot(mapping = aes(x = sleep_hours, y = exam_score)) +
  geom_point(size = 1, colour = "gray30", alpha = 0.3) +
  geom_smooth(se = FALSE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs sleep hours",
    x = "Sleep hours",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Scatter plot of exam score against class attendance, with a straight-line
# fit; se = TRUE also draws the confidence band around the line.
dataset |>
  ggplot(mapping = aes(x = class_attendance, y = exam_score)) +
  geom_point(size = 1, colour = "gray30", alpha = 0.3) +
  geom_smooth(se = TRUE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs class attendance",
    x = "Class Attendance",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Same charts, coloured by group: first convert the grouping columns
# (gender, course, study_method, sleep_quality) from character to factor.
# A single across() call converts all four columns in one pass instead of
# running three separate mutate() pipelines over the data.
dataset <- dataset |>
  mutate(
    across(c(gender, course, study_method, sleep_quality), factor)
  )
# also works, one column at a time: dataset$gender <- factor(dataset$gender)

# Now plot with one colour per gender: points plus a straight-line fit for
# each gender, without confidence bands.
dataset |>
  ggplot(mapping = aes(x = study_hours, y = exam_score, colour = gender)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = FALSE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs study hours",
    x = "Study hours",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Exam score against class attendance, coloured by gender, with a
# straight-line fit and confidence band for each gender.
dataset |>
  ggplot(
    mapping = aes(x = class_attendance, y = exam_score, colour = gender)
  ) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = TRUE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs class attendance",
    x = "Class attendance",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Exam score against study hours, coloured by gender, this time with the
# confidence band (se = TRUE) around each fitted line.
dataset |>
  ggplot(mapping = aes(x = study_hours, y = exam_score, colour = gender)) +
  geom_point(alpha = 0.3) +
  geom_smooth(se = TRUE, method = "lm") +
  theme_minimal() +
  labs(
    title = "Exam score vs study hours by gender",
    x = "Study hours",
    y = "Exam score"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Count how many students fall in each category, starting with study_method
table(dataset[["study_method"]])
## 
##      coaching   group study         mixed online videos    self-study 
##          4036          3922          3894          4069          4079
# Number of students per sleep-quality category
table(dataset[["sleep_quality"]])
## 
## average    good    poor 
##    6694    6619    6687
# Number of students per course
table(dataset[["course"]])
## 
##   b.com    b.sc  b.tech      ba     bba     bca diploma 
##    2864    2878    2798    2896    2836    2902    2826
# Number of students with and without internet access
table(dataset[["internet_access"]])
## 
##    no   yes 
##  3012 16988
# Number of students per facility rating
table(dataset[["facility_rating"]])
## 
##   high    low medium 
##   6602   6638   6760
# Number of students per exam-difficulty level
table(dataset[["exam_difficulty"]])
## 
##     easy     hard moderate 
##     6141     3981     9878
# Visualize the counts: bar chart of the number of students per study method
dataset |>
  ggplot(mapping = aes(x = study_method)) +
  geom_bar(fill = "blue") +
  theme_minimal() +
  labs(
    title = "Distribution of study methods",
    x = "Study method",
    y = "Number of students"
  )

# Bar chart of the number of students per sleep-quality category
# (categories appear in the factor's current level order).
dataset |>
  ggplot(mapping = aes(x = sleep_quality)) +
  geom_bar(fill = "blue") +
  theme_minimal() +
  labs(
    title = "Sleep quality frequency",
    x = "Sleep quality",
    y = "number of students"
  )

# Set the sleep-quality levels explicitly to poor, average, good so that
# plots and tables list the categories in that order.
dataset <- dataset |>
  mutate(
    sleep_quality = factor(sleep_quality, levels = c("poor", "average", "good"))
  )

# Bar chart of the number of students per sleep-quality category, drawn
# again after the levels were reordered.
dataset |>
  ggplot(mapping = aes(x = sleep_quality)) +
  geom_bar(fill = "blue") +
  theme_minimal() +
  labs(
    title = "Distribution of Students",
    x = "Sleep quality",
    y = "Number of students"
  )

# Box plot of the exam-score distribution for each study method
dataset |>
  ggplot(mapping = aes(x = study_method, y = exam_score)) +
  geom_boxplot(colour = "grey20", fill = "grey80") +
  theme_minimal() +
  labs(
    title = "Exam score by study method",
    x = "Study method",
    y = "Exam score"
  )

# Convert exam_difficulty to a factor with levels in the order
# easy, moderate, hard so plots list the categories in that order.
dataset <- dataset |>
  mutate(
    exam_difficulty = factor(
      exam_difficulty,
      levels = c("easy", "moderate", "hard")
    )
  )

# Box plot of the exam-score distribution for each exam-difficulty level.
# The y aesthetic is exam_score, so the axis and title are labelled as
# exam scores; the previous labels described a student count, which this
# plot does not show.
ggplot(dataset, aes(x = exam_difficulty, y = exam_score)) +
  geom_boxplot(fill = "grey80", color = "grey20") +
  labs(
    x = "Exam difficulty",
    y = "Exam score",
    title = "Exam score by exam difficulty"
  ) +
  theme_minimal()

# Convert facility_rating to a factor with levels in the order
# low, medium, high so plots list the categories in that order.
dataset <- dataset |>
  mutate(
    facility_rating = factor(facility_rating, levels = c("low", "medium", "high"))
  )

# Box plot of the exam-score distribution for each facility rating
dataset |>
  ggplot(mapping = aes(x = facility_rating, y = exam_score)) +
  geom_boxplot(colour = "grey20", fill = "grey80") +
  theme_minimal() +
  labs(
    title = "Distribution of Exam score per facility rating",
    x = "Facility rating",
    y = "Exam score"
  )

# Convert internet_access from character to factor (default level order)
dataset <- dataset |>
  mutate(internet_access = factor(internet_access))

# Box plot of the exam-score distribution for each exam-difficulty level
dataset |>
  ggplot(mapping = aes(x = exam_difficulty, y = exam_score)) +
  geom_boxplot(colour = "grey20", fill = "grey90") +
  theme_minimal() +
  labs(
    title = "Exam score vs exam difficulty",
    x = "Exam difficulty",
    y = "Exam score"
  )